Preload things.
Dictionary:
# Frame of scores
# Load the per-feature information-gain table, then rebuild it with zero-filled
# placeholder columns (Splits, IV, RankIV) that are filled in later, plus a
# Number column holding each feature's row position in this frame.
# The row count is taken from the file itself rather than hard-coded, so the
# placeholder columns always match the number of features actually read.
ig_frame <- fread("E:/Laurae/NumericCMI_exact_best_grid/IG_2216feat.csv", data.table = FALSE)
ig_frame <- data.frame(Feature = ig_frame$Feature,
                       Type = ig_frame$Type,
                       Levels = ig_frame$Levels,
                       Splits = rep(0, nrow(ig_frame)),
                       NotMissing = ig_frame$NotMissing,
                       IG = ig_frame$IG,
                       IV = rep(0, nrow(ig_frame)),
                       RankIG = ig_frame$RankIG,
                       RankIV = rep(0, nrow(ig_frame)),
                       Number = seq_len(nrow(ig_frame)),
                       stringsAsFactors = FALSE)

# Folds for CV
# folds[k] is the fold id (1..5) of row k; rows that appear in none of the
# five fold files keep the initial value 0.
# NOTE(review): 1183747 is presumably the number of training rows - confirm.
folds <- numeric(1183747)
for (i in 1:5) {
  # Column `x` of each fold file is used as the row positions of fold i.
  folds[fread(paste0("E:/Laurae/folds/folds0", i, ".csv"))$x] <- i
}
Formulas:
# Force a garbage collection and print memory usage before the computation starts.
gc(verbose = TRUE)
Garbage collection 90 = 37+8+45 (level 2) ...
89.3 Mbytes of cons cells used (63%)
18438.7 Mbytes of vectors used (62%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1671949 89.3 2637877 140.9 2637877 140.9
Vcells 2416793867 18438.7 3910687954 29836.2 2420182705 18464.6
# Empty rule table: one row per bin of a feature, with the columns that the
# parallel loop below produces (two character columns, ten numeric columns).
# `large_data` accumulates the rules of every batch.
large_data <- data.frame(
  variable = character(0),
  class = character(0),
  pct_bin = numeric(0),
  good = numeric(0),
  bad = numeric(0),
  bad_rate = numeric(0),
  inflation = numeric(0),
  pct_good = numeric(0),
  pct_bad = numeric(0),
  odds = numeric(0),
  woe = numeric(0),
  miv = numeric(0),
  stringsAsFactors = FALSE
)

# `default` is the same zero-row schema, used as the `.init` value of the
# foreach/rbind combine.
default <- large_data
# Magic function to coerce super quickly.
# Turns a data.table into a plain data.frame by rewriting its attributes with
# setattr() instead of rebuilding the object: plain row names are installed,
# the class is reset to "data.frame", and the "sorted" and ".internal.selfref"
# attributes are removed.
# Stops with an error when `x` is not a data.table. The value of the last
# setattr() call is returned.
# NOTE(review): this definition masks any setDF() exported by an attached
# package (recent data.table versions ship one) - confirm that is intended.
setDF <- function(x) {
  if (!inherits(x, "data.table")) {
    stop("x must be a data.table")
  }
  plain_row_names <- .set_row_names(nrow(x))
  setattr(x, "row.names", plain_row_names)
  setattr(x, "class", "data.frame")
  setattr(x, "sorted", NULL)
  setattr(x, ".internal.selfref", NULL)
}
# Remember when the job started; later elapsed-time and ETA figures are
# computed as (current - StartTime) / 1000 and labelled in seconds.
StartTime <- System$currentTimeMillis()
# Log the wall-clock start time of the job.
cat("Job started on ", format(Sys.time(), "%a %b %d %Y %X"), ". \n", sep = "")
Job started on Thu Oct 13 2016 08:19:58 PM.
# Progress window covering the 2216 features to process.
pb <- winProgressBar(
  title = "Mutual Information computation",
  label = paste0("[", format(Sys.time(), "%a %b %d %Y %X"), "] Preparing computation..."),
  min = 0,
  max = 2216,
  initial = 0,
  width = 520
)

# Parallel backend: `cores` workers, `choose` features per batch.
cores <- 6
choose <- cores * 5 # 5 times per core per batch
mcl <- makeCluster(cores)

# Ship the fold assignment to every worker and load the packages the
# parallel loop body relies on.
clusterExport(mcl, "folds")
invisible(clusterEvalQ(mcl, {
  library("woe")
  library("data.table")
  library("rpart")
}))

# Make %dopar% run on this cluster.
registerDoParallel(cl = mcl)
# Do loop
# Walks over the feature columns of `data` in batches of `choose` columns.
# For each batch: the columns plus `target` are exported to the workers, every
# feature is binned in parallel with iv.mult(), the per-feature IV (sum of miv)
# and number of bins are stored in `ig_frame`, the rules are appended to
# `large_data`, and the progress window is refreshed.
n_features <- 2216
n_batches <- ceiling(n_features / choose)
for (i in seq_len(n_batches)) {
  # Column positions handled by this batch; the last batch may be shorter.
  batch_start <- (i - 1) * choose + 1
  batch_end <- min(i * choose, n_features)
  batch_cols <- batch_start:batch_end

  # Prepare parallel loop
  mini_temp <- data[, batch_cols, with = FALSE]
  mini_temp$target <- target
  setDF(mini_temp)
  clusterExport(mcl, c("mini_temp"))

  # Parallel loop setup
  # One task per feature column of the batch (`target`, the last column of
  # mini_temp, is never indexed by j). mini_temp and folds are already on the
  # workers through clusterExport(), hence .noexport.
  whatever <- foreach(
    j = seq_along(batch_cols),
    .combine = "rbind",
    .init = default,
    .inorder = TRUE,
    .noexport = c("mini_temp", "folds")
  ) %dopar% {
    rules <- iv.mult(df = mini_temp,
                     y = "target",
                     vars = colnames(mini_temp)[j],
                     sql = FALSE,
                     topbin = TRUE,
                     tbpct = 0.0001,
                     verbose = FALSE,
                     rcontrol = rpart.control(minsplit = 100, cp = 0.00001, xval = folds))
    if (length(rules) == 0) {
      # Nothing returned for this feature: emit a single all-zero placeholder
      # row so the feature still appears in the results.
      rules <- data.frame(variable = colnames(mini_temp)[j],
                          class = "",
                          pct_bin = 0,
                          good = 0,
                          bad = 0,
                          bad_rate = 0,
                          inflation = 0,
                          pct_good = 0,
                          pct_bad = 0,
                          odds = 0,
                          woe = 0,
                          miv = 0,
                          stringsAsFactors = FALSE)
    } else {
      rules <- as.data.frame(rules[[1]])
    }
    rules
  }
  invisible(clusterEvalQ(mcl, gc(verbose = FALSE)))

  # Harvest statistics
  # Do IV: per-feature sum of miv, written to the matching ig_frame row.
  whatever2 <- merge(aggregate(whatever$miv, by = list(Feature = whatever$variable), FUN = sum),
                     ig_frame[, c("Feature", "Number")],
                     by = "Feature")
  ig_frame$IV[whatever2$Number] <- whatever2$x
  # Do Splits: per-feature number of rule rows.
  whatever2 <- merge(aggregate(whatever$miv, by = list(Feature = whatever$variable), FUN = length),
                     ig_frame[, c("Feature", "Number")],
                     by = "Feature")
  ig_frame$Splits[whatever2$Number] <- whatever2$x
  large_data <- rbind(large_data, whatever)

  # Batch summary.
  # NOTE(review): indexing ig_frame by column position assumes ig_frame rows
  # are in the same order as the columns of `data` - confirm.
  batch_iv <- ig_frame$IV[batch_cols]
  batch_splits <- ig_frame$Splits[batch_cols]
  cat("Batch ", i, ": ", mean(batch_iv), " with ", mean(batch_splits),
      " splits - Found ", nrow(large_data), " rules (+", nrow(whatever),
      " rules). Max IV found was: ", max(batch_iv), ". \n", sep = "")
  gc(verbose = FALSE)

  # Progress reporting.
  CurrentTime <- System$currentTimeMillis()
  elapsed_s <- (CurrentTime - StartTime) / 1000
  ETA <- if (i == n_batches) {
    0
  } else {
    ((n_features / choose) - i) * (CurrentTime - StartTime) / i / 1000
  }
  pb_title <- paste0("Mutual Information computation [CPU=", sprintf("%07.2f", elapsed_s),
                     "s | ETA=", sprintf("%07.2f", ETA), "s]")
  # Use the number of features really processed (batch_end), not i * choose:
  # on the last, shorter batch i * choose exceeds the feature count, which
  # gave an out-of-range bar value, an NA feature name and more than 100%.
  pb_iter <- elapsed_s / batch_end
  setWinProgressBar(pb,
                    value = batch_end,
                    title = pb_title,
                    label = paste0("[", format(Sys.time(), "%X"), " | ", sprintf("%04.2f", pb_iter),
                                   " s/iter] Doing feature ", colnames(data)[batch_end],
                                   " (", sprintf("%04d", batch_end), " / ", n_features, " = ",
                                   sprintf("%05.2f", 100 * batch_end / n_features), "%)..."))
}
Batch 1: 0.0154465 with 12.43333 splits - Found 373 rules (+373 rules). Max IV found was: 0.0552983.
Batch 2: 0.0247018 with 28.23333 splits - Found 1220 rules (+847 rules). Max IV found was: 0.05868686.
Batch 3: 0.01565962 with 28.8 splits - Found 2084 rules (+864 rules). Max IV found was: 0.04721482.
Batch 4: 0.02412576 with 69.13333 splits - Found 4158 rules (+2074 rules). Max IV found was: 0.06945033.
Batch 5: 0.01582699 with 67 splits - Found 6168 rules (+2010 rules). Max IV found was: 0.05667931.
Batch 6: 0.01183151 with 57.03333 splits - Found 7879 rules (+1711 rules). Max IV found was: 0.03446974.
Batch 7: 0.002596302 with 27.2 splits - Found 8695 rules (+816 rules). Max IV found was: 0.007706566.
Batch 8: 0.001305062 with 24.46667 splits - Found 9429 rules (+734 rules). Max IV found was: 0.005127166.
Batch 9: 0.003844629 with 48.06667 splits - Found 10871 rules (+1442 rules). Max IV found was: 0.01280566.
Batch 10: 0.002558682 with 24.7 splits - Found 11612 rules (+741 rules). Max IV found was: 0.007548303.
Batch 11: 0.01325514 with 49.4 splits - Found 13094 rules (+1482 rules). Max IV found was: 0.0787944.
Batch 12: 0.04184444 with 72.06667 splits - Found 15256 rules (+2162 rules). Max IV found was: 0.08536611.
Batch 13: 0.03745128 with 87.76667 splits - Found 17889 rules (+2633 rules). Max IV found was: 0.08519817.
Batch 14: 0.02331317 with 59.53333 splits - Found 19675 rules (+1786 rules). Max IV found was: 0.1888925.
Batch 15: 0.004385769 with 47.7 splits - Found 21106 rules (+1431 rules). Max IV found was: 0.01092735.
Batch 16: 0.006184872 with 56.96667 splits - Found 22815 rules (+1709 rules). Max IV found was: 0.02319138.
Batch 17: 0.000613954 with 9.1 splits - Found 23088 rules (+273 rules). Max IV found was: 0.003282883.
Batch 18: 0.0002594504 with 10.93333 splits - Found 23416 rules (+328 rules). Max IV found was: 0.003060614.
Batch 19: 0.0003178432 with 4.9 splits - Found 23563 rules (+147 rules). Max IV found was: 0.002242485.
Batch 20: 2.155617e-05 with 3.4 splits - Found 23665 rules (+102 rules). Max IV found was: 0.0005943852.
Batch 21: 0.0006674818 with 14.13333 splits - Found 24089 rules (+424 rules). Max IV found was: 0.003316824.
Batch 22: 0.0005191635 with 16.96667 splits - Found 24598 rules (+509 rules). Max IV found was: 0.002015877.
Batch 23: 0.02090894 with 46.16667 splits - Found 25983 rules (+1385 rules). Max IV found was: 0.08340856.
Batch 24: 0.01697376 with 50.4 splits - Found 27495 rules (+1512 rules). Max IV found was: 0.06174175.
Batch 25: 0.0845686 with 15.6 splits - Found 27963 rules (+468 rules). Max IV found was: 0.1879031.
Batch 26: 0.01331199 with 3.6 splits - Found 28071 rules (+108 rules). Max IV found was: 0.06149098.
Batch 27: 0.01865425 with 4.466667 splits - Found 28205 rules (+134 rules). Max IV found was: 0.1640065.
Batch 28: 0.07968504 with 4.133333 splits - Found 28329 rules (+124 rules). Max IV found was: 0.2856957.
Batch 29: 0.006640682 with 38.06667 splits - Found 29471 rules (+1142 rules). Max IV found was: 0.02194735.
Batch 30: 0.005559084 with 38.63333 splits - Found 30630 rules (+1159 rules). Max IV found was: 0.02037056.
Batch 31: 0.001645852 with 9.366667 splits - Found 30911 rules (+281 rules). Max IV found was: 0.01401855.
Batch 32: 2.752827e-05 with 1.133333 splits - Found 30945 rules (+34 rules). Max IV found was: 0.0001463404.
Batch 33: 0.0001115851 with 1.033333 splits - Found 30976 rules (+31 rules). Max IV found was: 0.002565435.
Batch 34: 0.0004295566 with 1.266667 splits - Found 31014 rules (+38 rules). Max IV found was: 0.002138272.
Batch 35: 0.007773453 with 1.433333 splits - Found 31057 rules (+43 rules). Max IV found was: 0.02702745.
Batch 36: 7.379473e-05 with 1.033333 splits - Found 31088 rules (+31 rules). Max IV found was: 0.0006908366.
Batch 37: 0.001977097 with 1.2 splits - Found 31124 rules (+36 rules). Max IV found was: 0.01425203.
Batch 38: 0.02751318 with 1.433333 splits - Found 31167 rules (+43 rules). Max IV found was: 0.4926713.
Batch 39: 0.1379648 with 273.2333 splits - Found 39364 rules (+8197 rules). Max IV found was: 0.57725.
Batch 40: 0.05938486 with 223.6333 splits - Found 46073 rules (+6709 rules). Max IV found was: 0.3370565.
Batch 41: 0.03602823 with 129.7333 splits - Found 49965 rules (+3892 rules). Max IV found was: 0.136041.
Batch 42: 0.007719532 with 53.93333 splits - Found 51583 rules (+1618 rules). Max IV found was: 0.1369454.
Batch 43: 0.2448408 with 195.0667 splits - Found 57435 rules (+5852 rules). Max IV found was: 1.032843.
Batch 44: 0.1329912 with 194.4 splits - Found 63267 rules (+5832 rules). Max IV found was: 1.066158.
Batch 45: 0.0001513323 with 1 splits - Found 63297 rules (+30 rules). Max IV found was: 0.00453997.
Batch 46: 0.008592273 with 1 splits - Found 63327 rules (+30 rules). Max IV found was: 0.02687168.
Batch 47: 0.0004848828 with 1.033333 splits - Found 63358 rules (+31 rules). Max IV found was: 0.008784763.
Batch 48: 0.02249145 with 1.133333 splits - Found 63392 rules (+34 rules). Max IV found was: 0.316063.
Batch 49: 0.1573172 with 335.5667 splits - Found 73459 rules (+10067 rules). Max IV found was: 0.57725.
Batch 50: 0.02712369 with 101.7 splits - Found 76510 rules (+3051 rules). Max IV found was: 0.1336369.
Batch 51: 0.007874745 with 63.53333 splits - Found 78416 rules (+1906 rules). Max IV found was: 0.1369454.
Batch 52: 0.1866265 with 179.6333 splits - Found 83805 rules (+5389 rules). Max IV found was: 1.066158.
Batch 53: 0.1477571 with 322.0667 splits - Found 93467 rules (+9662 rules). Max IV found was: 0.57725.
Batch 54: 0.02167264 with 94.36667 splits - Found 96298 rules (+2831 rules). Max IV found was: 0.3370565.
Batch 55: 0.02860292 with 119.1333 splits - Found 99872 rules (+3574 rules). Max IV found was: 0.1369454.
Batch 56: 0.06534473 with 74.36667 splits - Found 102103 rules (+2231 rules). Max IV found was: 0.8584556.
Batch 57: 0.2584039 with 328.7667 splits - Found 111966 rules (+9863 rules). Max IV found was: 1.066158.
Batch 58: 0.05401922 with 184.0333 splits - Found 117487 rules (+5521 rules). Max IV found was: 0.3370565.
Batch 59: 0.03316001 with 136.6333 splits - Found 121586 rules (+4099 rules). Max IV found was: 0.1369454.
Batch 60: 0.001645289 with 19.63333 splits - Found 122175 rules (+589 rules). Max IV found was: 0.006712436.
Batch 61: 0.2236541 with 222.7333 splits - Found 128857 rules (+6682 rules). Max IV found was: 1.066158.
Batch 62: 0.005863716 with 11.63333 splits - Found 129206 rules (+349 rules). Max IV found was: 0.02036483.
Batch 63: 0.01199258 with 12.26667 splits - Found 129574 rules (+368 rules). Max IV found was: 0.03640654.
Batch 64: 0.002771426 with 7.366667 splits - Found 129795 rules (+221 rules). Max IV found was: 0.01063805.
Batch 65: 0.05406482 with 95.53333 splits - Found 132661 rules (+2866 rules). Max IV found was: 0.3632893.
Batch 66: 0.01328544 with 50.66667 splits - Found 134181 rules (+1520 rules). Max IV found was: 0.08681855.
Batch 67: 0.004392374 with 25.1 splits - Found 134934 rules (+753 rules). Max IV found was: 0.03496181.
Batch 68: 0.007772502 with 17.76667 splits - Found 135467 rules (+533 rules). Max IV found was: 0.03088115.
Batch 69: 0.0384596 with 26 splits - Found 136247 rules (+780 rules). Max IV found was: 0.4160906.
Batch 70: 0.006765668 with 14.9 splits - Found 136694 rules (+447 rules). Max IV found was: 0.1606605.
Batch 71: 0.0003422376 with 1 splits - Found 136724 rules (+30 rules). Max IV found was: 0.001665384.
Batch 72: 0.008600526 with 1 splits - Found 136754 rules (+30 rules). Max IV found was: 0.02687168.
Batch 73: 0.001286627 with 1 splits - Found 136784 rules (+30 rules). Max IV found was: 0.01386439.
Batch 74: 0.03094415 with 6.346154 splits - Found 136949 rules (+165 rules). Max IV found was: 0.316063.
# Tear down the parallel setup once the batch loop is finished.
# Put foreach back on the sequential backend before the cluster goes away.
registerDoSEQ()
# Stop the worker processes created with makeCluster().
stopCluster(mcl)
# Close any connections that are still open.
closeAllConnections()
# Close the progress bar; invisible() keeps the return value from printing.
invisible(close(pb))
# Force a garbage collection and print memory usage after the run.
gc(verbose = TRUE)
Garbage collection 165 = 37+8+120 (level 2) ...
103.1 Mbytes of cons cells used (60%)
18699.2 Mbytes of vectors used (63%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1929088 103.1 3205452 171.2 2637877 140.9
Vcells 2450938346 18699.2 3910687954 29836.2 2572595015 19627.4
# Report the total time elapsed since StartTime, divided by 1000 and labelled in seconds.
cat("Computation time: ", sprintf("%07.2f", (System$currentTimeMillis() - StartTime) / 1000), "s. \n", sep = "")
Computation time: 6280.98s.
# Log the wall-clock time at which the job finished.
cat("Job over on ", format(Sys.time(), "%a %b %d %Y %X"), ". \n", sep = "")
Job over on Thu Oct 13 2016 10:04:39 PM.
# Keep a copy of all collected rules, force the bin label column to character,
# and save the rules to CSV.
mass_data <- large_data
mass_data[["class"]] <- as.character(mass_data[["class"]])
fwrite(mass_data, "E:/Laurae/NumericCMI_exact_best_grid/IG_IV_splits.csv")

# Per-feature score table: RankIV is the descending rank of IV (the largest IV
# gets rank 1; with ties.method = "max" on the ascending rank, tied features
# share the best descending rank of their group). The helper column Number is
# dropped.
all_frame <- ig_frame
all_frame[["RankIV"]] <- (nrow(all_frame) + 1) - rank(all_frame[["IV"]], ties.method = "max")
all_frame[["Number"]] <- NULL

# Force a garbage collection and print memory usage.
gc(verbose = TRUE)
Garbage collection 166 = 37+8+121 (level 2) ...
103.1 Mbytes of cons cells used (60%)
18699.3 Mbytes of vectors used (63%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1929641 103.1 3205452 171.2 2711633 144.9
Vcells 2450943581 18699.3 3910687954 29836.2 2572595015 19627.4
# Save the per-feature score table (without the Number helper column) to CSV.
fwrite(all_frame, "E:/Laurae/NumericCMI_exact_best_grid/IG_IV_scores.csv")
Leak reference: IV = 2.843399 (super large!!! The largest IV found above is 1.066158) - 3 splits: (-Inf, -1.5), [-1.5, 164), [164, +Inf)
# Interactive table of the per-feature scores in `all_frame`.

# Helper: fill the cells of `column` with a horizontal bar of the given color,
# scaled over `limits`.
add_color_bar <- function(widget, column, limits, color) {
  formatStyle(widget,
              column,
              background = styleColorBar(limits, color),
              backgroundSize = "100% 90%",
              backgroundRepeat = "no-repeat",
              backgroundPosition = "center")
}

# Helper: range of the finite, non-missing values of a column.
finite_range <- function(values) {
  range(values, na.rm = TRUE, finite = TRUE)
}

datatable(
  all_frame,
  filter = "top",
  class = "cell-border stripe",
  plugins = "natural",
  extensions = c(
    "AutoFill",
    #"Buttons",
    "ColReorder",
    "KeyTable",
    "Responsive",
    "RowReorder"
  ),
  options = list(
    style = "bootstrap",
    pageLength = 20,
    lengthMenu = c(5, 10, 15, 20, 25, 50, 100, 250, 500, 1000, 2500),
    # Initial sort: column index 7, descending. With the row-name column at
    # index 0 this is the IV column of all_frame.
    order = list(list(7, "desc")),
    autofill = TRUE,
    #dom = "Bfrtip",
    #buttons = c("copy", "csv", "excel", "pdf", "print"),
    colReorder = TRUE,
    keys = TRUE,
    rowReorder = TRUE,
    searchHighlight = TRUE,
    search = list(regex = TRUE, caseInsensitive = FALSE)
  )
) %>%
  # In-cell bars; NotMissing is a fraction, so it is scaled on fixed [0, 1].
  add_color_bar("IG", finite_range(all_frame$IG), "lightgreen") %>%
  add_color_bar("IV", finite_range(all_frame$IV), "pink") %>%
  add_color_bar("Splits", finite_range(all_frame$Splits), "yellow") %>%
  add_color_bar("NotMissing", c(0, 1), "lightgrey") %>%
  add_color_bar("Levels", finite_range(all_frame$Levels), "lightgrey") %>%
  # Number formatting.
  formatRound(columns = c("IG"), digits = 8) %>%
  formatRound(columns = c("IV"), digits = 8) %>%
  formatPercentage(columns = c("NotMissing"), digits = 4)
Table:
# Interactive table of the binning rules whose inflation is at least 1.

# Helper: fill the cells of `column` with a horizontal bar of the given color,
# scaled over `limits`.
add_color_bar <- function(widget, column, limits, color) {
  formatStyle(widget,
              column,
              background = styleColorBar(limits, color),
              backgroundSize = "100% 90%",
              backgroundRepeat = "no-repeat",
              backgroundPosition = "center")
}

# Helper: range of the finite, non-missing values of a column.
finite_range <- function(values) {
  range(values, na.rm = TRUE, finite = TRUE)
}

datatable(
  # which() drops rows whose inflation is NA/NaN; a bare logical index would
  # turn each of them into an all-NA row in the displayed table.
  mass_data[which(mass_data$inflation >= 1), ],
  filter = "top",
  class = "cell-border stripe",
  plugins = "natural",
  extensions = c(
    "AutoFill",
    #"Buttons",
    "ColReorder",
    "KeyTable",
    "Responsive",
    "RowReorder"
  ),
  options = list(
    style = "bootstrap",
    pageLength = 20,
    lengthMenu = c(5, 10, 15, 20, 25, 50, 100, 250, 500, 1000, 2500),
    # NOTE(review): initial sort on column index 13. If mass_data has only the
    # 12 columns of the empty rule template (plus row names at index 0), the
    # last valid index is 12 (miv) - confirm the intended sort column.
    order = list(list(13, "desc")),
    autofill = TRUE,
    #dom = "Bfrtip",
    #buttons = c("copy", "csv", "excel", "pdf", "print"),
    colReorder = TRUE,
    keys = TRUE,
    rowReorder = TRUE,
    searchHighlight = TRUE,
    search = list(regex = TRUE, caseInsensitive = FALSE)
  )
) %>%
  # In-cell bars. Ranges are taken over all of mass_data (not only the rows
  # shown); fraction-like columns use a fixed [0, 1] scale.
  add_color_bar("woe", finite_range(mass_data$woe), "lightgreen") %>%
  add_color_bar("miv", finite_range(mass_data$miv), "pink") %>%
  add_color_bar("good", finite_range(mass_data$good), "yellow") %>%
  add_color_bar("bad", finite_range(mass_data$bad), "orange") %>%
  add_color_bar("pct_good", c(0, 1), "yellow") %>%
  add_color_bar("pct_bad", c(0, 1), "orange") %>%
  add_color_bar("bad_rate", c(0, 1), "orange") %>%
  add_color_bar("pct_bin", c(0, 1), "lightgrey") %>%
  add_color_bar("inflation", finite_range(mass_data$inflation), "lightgrey") %>%
  # Number formatting.
  formatRound(columns = c("woe"), digits = 6) %>%
  formatRound(columns = c("miv"), digits = 6) %>%
  formatRound(columns = c("odds"), digits = 6) %>%
  formatRound(columns = c("inflation"), digits = 6) %>%
  formatPercentage(columns = c("pct_bin"), digits = 4) %>%
  formatPercentage(columns = c("bad_rate"), digits = 4) %>%
  formatPercentage(columns = c("pct_good"), digits = 4) %>%
  formatPercentage(columns = c("pct_bad"), digits = 4)
It seems your data is too big for client-side DataTables. You may consider server-side processing: http://rstudio.github.io/DT/server.htmlIt seems your data is too big for client-side DataTables. You may consider server-side processing: http://rstudio.github.io/DT/server.html